In [1]:
%%capture
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas_profiling 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

Wifi Fingerprint Locationing¶

About the dataset


  • It covers a surface of 108703m2 including 3 buildings with 4 or 5 floors depending on the building.

  • The number of different places (reference points) appearing in the database is 933.

  • 21049 sampled points have been captured: 19938 for training/learning and 1111 for validation/testing.

  • Dataset independence has been assured by taking Validation (or testing) samples 4 months after Training ones.

  • The number of different wireless access points (WAPs) appearing in the database is 520. The intensity values are represented as negative integer values
    ranging from -104 dBm (extremely poor signal) to 0 dBm. The positive value 100 is used to denote when a WAP was not detected.

  • Data were collected by more than 20 users using 25 different models of mobile devices (some users used more than one model).

In [2]:
# Display the campus map figure from the UJIIndoorLoc paper (ResearchGate-hosted image).
from IPython.display import Image
from IPython.core.display import HTML 
# NOTE(review): width=9000 with height=200 looks like a typo (perhaps 900 intended) — confirm intended size.
Image(url= "https://www.researchgate.net/profile/Joaquin-Torres-Sospedra/publication/283894296/figure/fig7/AS:676977799331845@1538415493912/Map-of-the-UJI-Riu-Sec-Campus-and-zoom-on-the-Tx-Buildings-Pink-refers-to-the-ESTCE-Tx.png",
     width=9000, height=200)
Out[2]:
In [3]:
%%capture
# NOTE(review): this cell re-imports everything already imported in cell 1
# (plus a few extras) — consolidate all imports into a single top cell so the
# notebook survives Restart & Run All with a clear dependency list.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas_profiling 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
import warnings
from matplotlib.pyplot import figure
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
warnings.filterwarnings('ignore')

Data pre-processing¶

Importing our datasets and transforming the WAP values for better interpretation. "0" will represent that the WAP was not detected, and it will scale from "1" (extremely poor signal) to "105" (extremely good signal).

In [4]:
def _rescale_waps(frame):
    """Return a copy of `frame` with its 520 WAP columns mapped onto a 0-105 scale.

    Raw intensities are negative integers in [-104, 0] dBm, and the sentinel
    +100 marks "WAP not detected".  After rescaling:
        not detected (+100)  -> 0
        -104 dBm (weakest)   -> 1
        0 dBm (strongest)    -> 105
    Columns from position 520 onward (the label columns) are kept unchanged.
    """
    wap = frame.iloc[:, 0:520].copy()
    wap.iloc[:, 0:520] = np.where(frame.iloc[:, 0:520] <= 0,
                                  frame.iloc[:, 0:520] + 105,
                                  frame.iloc[:, 0:520] - 100)
    return pd.concat([wap, frame.iloc[:, 520:]], axis=1)


# The same transform was copy-pasted for both datasets in the original;
# factored into the helper above.
# NOTE(review): hard-coded absolute Windows paths — prefer a configurable
# DATA_DIR (pathlib.Path) so the notebook runs on other machines.
df = _rescale_waps(pd.read_csv(r'C:\Users\andre\OneDrive\Andres Marquez\UBIQUM\Project 4\Task 2\trainingData.csv'))

validation = _rescale_waps(pd.read_csv(r'C:\Users\andre\OneDrive\Andres Marquez\UBIQUM\Project 4\Task 2\ValidationData.csv'))
In [5]:
# Preview the rescaled training data (529 columns: 520 WAPs + label columns).
df.head()
Out[5]:
WAP001 WAP002 WAP003 WAP004 WAP005 WAP006 WAP007 WAP008 WAP009 WAP010 ... WAP520 LONGITUDE LATITUDE FLOOR BUILDINGID SPACEID RELATIVEPOSITION USERID PHONEID TIMESTAMP
0 0 0 0 0 0 0 0 0 0 0 ... 0 -7541.2643 4.864921e+06 2 1 106 2 2 23 1371713733
1 0 0 0 0 0 0 0 0 0 0 ... 0 -7536.6212 4.864934e+06 2 1 106 2 2 23 1371713691
2 0 0 0 0 0 0 0 8 0 0 ... 0 -7519.1524 4.864950e+06 2 1 103 2 2 23 1371714095
3 0 0 0 0 0 0 0 0 0 0 ... 0 -7524.5704 4.864934e+06 2 1 102 2 2 23 1371713807
4 0 0 0 0 0 0 0 0 0 0 ... 0 -7632.1436 4.864982e+06 0 0 122 2 11 13 1369909710

5 rows × 529 columns

Handling duplicates.

In [6]:
# How many fully duplicated rows does the training set contain?
df.duplicated().value_counts()
Out[6]:
False    19300
True       637
dtype: int64
In [7]:
# Same duplicate check for the validation set.
validation.duplicated().value_counts()
Out[7]:
False    1111
dtype: int64
In [8]:
# Drop the duplicated training rows, then re-check that none remain.
df = df.drop_duplicates()
df.duplicated().value_counts()
Out[8]:
False    19300
dtype: int64

Handling missing values.

In [9]:
# Verify neither dataset has missing values in any of its 529 columns.
print(df.isnull().any().value_counts())
print(validation.isnull().any().value_counts())
False    529
dtype: int64
False    529
dtype: int64

Transform Timestamp Unix Format into date format for better understanding.

In [10]:
# Convert Unix epoch seconds into pandas datetimes for readability.
df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'],unit='s')

validation['TIMESTAMP'] = pd.to_datetime(validation['TIMESTAMP'],unit='s')

Given that the buildings appear rotated (diagonal), we are going to apply a transformation in order to view them in a horizontal (front-facing) way.

In [11]:
# Raw coordinates: the campus appears rotated (diagonal) in lon/lat space.
fig = px.scatter(df, x="LONGITUDE", y="LATITUDE", color='BUILDINGID')
fig.show()
In [12]:
# Rotate longitude/latitude by 28 degrees (true-north correction) so the
# buildings render horizontally.  The same rotation is applied to both the
# training and the validation frames.
trueNorth = 28 * np.pi / 180
cos_t = np.cos(trueNorth)
sin_t = np.sin(trueNorth)

for frame in (df, validation):
    frame['LONGITUDE_N'] = frame['LONGITUDE'] * cos_t - frame['LATITUDE'] * sin_t
    frame['LATITUDE_N'] = frame['LONGITUDE'] * sin_t + frame['LATITUDE'] * cos_t
In [13]:
# Relocate the rotated coordinates right after the 520 WAP columns.
# After both inserts the column order is:
#   WAPs(0-519), LATITUDE_N(520), LONGITUDE_N(521), then the original label columns.
df.insert(520, 'LONGITUDE_N', df.pop('LONGITUDE_N'))
df.insert(520, 'LATITUDE_N', df.pop('LATITUDE_N'))

validation.insert(520, 'LONGITUDE_N', validation.pop('LONGITUDE_N'))
validation.insert(520, 'LATITUDE_N', validation.pop('LATITUDE_N'))
In [14]:
# After rotation the buildings line up horizontally, confirming the transform.
fig = px.scatter(df, x="LONGITUDE_N", y="LATITUDE_N", color='BUILDINGID')
fig.show()

Now that we know the transformation is correct, we are going to replace the previous coordinate values with the new ones.

In [15]:
# Replace the original coordinates with the rotated ones, reusing the old
# column names so downstream code can refer to LATITUDE / LONGITUDE as before.
df.drop(['LATITUDE', 'LONGITUDE'], axis=1, inplace=True)
df.rename(columns={"LATITUDE_N": "LATITUDE", "LONGITUDE_N": "LONGITUDE"}, inplace=True)


validation.drop(['LATITUDE', 'LONGITUDE'], axis=1, inplace=True)
validation.rename(columns={"LATITUDE_N": "LATITUDE", "LONGITUDE_N": "LONGITUDE"}, inplace=True)

Let's dig deeper into the WAP values and see what's happening.

In [16]:
def _all_zero_waps(frame):
    """Return a one-column DataFrame ('WAP') naming the WAP columns whose
    maximum over all rows is 0, i.e. WAPs never detected in `frame`
    (after rescaling, 0 is both the minimum value and the "not detected" code).
    """
    col_max = frame.iloc[:, :520].max()
    return pd.DataFrame({'WAP': col_max[col_max == 0].index})


# The original built each list with a duplicated, convoluted
# `agg(['max']).max()` / reset_index / drop / rename chain; factored into the
# helper above.
waps = _all_zero_waps(df)
waps_val = _all_zero_waps(validation)

# WAPs that are all-zero in BOTH datasets would be safe to drop; count them.
waps_l = waps.merge(waps_val, on='WAP')
len(waps_l)
Out[16]:
0

We can see that both datasets have plenty of columns whose values are all 0s, but none of these columns are the same across the two datasets, so we can't drop them.

Let's analyze the Timestamp and see if there's something odd.

In [17]:
# Timestamps with suspiciously many observations (10+ rows in a single second).
vc = df['TIMESTAMP'].value_counts()
print(vc[vc >= 10])
2013-06-20 08:01:27    15
2013-06-20 07:57:58    12
2013-06-20 09:21:24    12
2013-06-20 09:27:12    12
2013-06-20 07:57:57    11
2013-06-20 07:55:07    11
2013-06-20 08:01:31    10
2013-06-20 07:49:35    10
2013-06-20 10:07:06    10
2013-06-12 16:04:22    10
2013-06-20 07:42:38    10
2013-06-20 09:24:53    10
2013-06-20 07:55:08    10
Name: TIMESTAMP, dtype: int64

There's clearly something weird — far too many records share the same timestamp. Let's analyze it per user and see whether any of them appear to be in different places at the same time.

In [18]:
# Count rows per (timestamp, user) over the 9 label columns: a single user
# producing many rows in the same second is suspicious.
df[df.columns[-9:]].groupby(['TIMESTAMP', 'USERID']).count().sort_values(by='FLOOR',ascending=False).head()
Out[18]:
LATITUDE LONGITUDE FLOOR BUILDINGID SPACEID RELATIVEPOSITION PHONEID
TIMESTAMP USERID
2013-06-20 08:01:27 13 10 10 10 10 10 10 10
2013-06-12 16:04:22 1 10 10 10 10 10 10 10
2013-06-20 10:07:06 14 8 8 8 8 8 8 8
2013-06-20 08:09:54 14 8 8 8 8 8 8 8
2013-06-20 07:42:38 14 7 7 7 7 7 7 7

Odd Users: 13, 1, 14, 11, 9, 6, 7

In [19]:
# Inspect the burst of rows sharing this exact timestamp.
df[(df['TIMESTAMP']=='2013-06-12 16:04:22')]
Out[19]:
WAP001 WAP002 WAP003 WAP004 WAP005 WAP006 WAP007 WAP008 WAP009 WAP010 ... WAP520 LATITUDE LONGITUDE FLOOR BUILDINGID SPACEID RELATIVEPOSITION USERID PHONEID TIMESTAMP
15955 0 0 0 0 0 0 0 0 0 0 ... 0 4.291947e+06 -2.290712e+06 2 0 130 2 1 14 2013-06-12 16:04:22
16220 0 0 0 0 0 0 0 0 0 0 ... 0 4.291941e+06 -2.290709e+06 2 0 140 2 1 14 2013-06-12 16:04:22
16488 0 0 0 0 0 0 0 0 0 0 ... 0 4.291941e+06 -2.290709e+06 2 0 140 2 1 14 2013-06-12 16:04:22
16757 0 0 0 0 0 0 0 0 0 0 ... 0 4.291941e+06 -2.290709e+06 2 0 140 2 1 14 2013-06-12 16:04:22
17035 0 0 0 0 0 0 0 0 0 0 ... 0 4.291941e+06 -2.290709e+06 2 0 140 2 1 14 2013-06-12 16:04:22
17311 0 0 0 0 0 0 0 0 0 0 ... 0 4.291941e+06 -2.290709e+06 2 0 140 2 1 14 2013-06-12 16:04:22
17587 0 0 0 0 0 0 0 0 0 0 ... 0 4.291941e+06 -2.290709e+06 2 0 140 2 1 14 2013-06-12 16:04:22
17864 0 0 0 0 0 0 0 0 0 0 ... 0 4.291941e+06 -2.290709e+06 2 0 140 2 1 14 2013-06-12 16:04:22
18139 0 0 0 0 0 0 0 0 0 0 ... 0 4.291947e+06 -2.290712e+06 2 0 130 2 1 14 2013-06-12 16:04:22
18413 0 0 0 0 0 0 0 0 0 0 ... 0 4.291947e+06 -2.290712e+06 2 0 130 2 1 14 2013-06-12 16:04:22

10 rows × 529 columns

After some analysis made in each of our users, we determine that users 13 and 1 have some odd values that cannot be trusted, therefore we'll be removing those values from our dataset.

In [20]:
# Remove the untrustworthy observations identified above: two specific rows
# (index 12168 and 12251) plus the burst of rows all sharing the
# 2013-06-12 16:04:22 timestamp.
suspect_idx = [12168, 12251]
flagged_rows = pd.concat([
    df.loc[suspect_idx],
    df[df['TIMESTAMP'] == '2013-06-12 16:04:22'],
])

df.drop(flagged_rows.index.values.tolist(), axis=0, inplace=True)

Now, let's move on in our analysis, we'll be plotting the building in 3D and see if we see any anomalies in it.

In [21]:
# 3-D view of all reference points, coloured by building.
fig = px.scatter_3d(df, x="LONGITUDE", y="LATITUDE", z='FLOOR', color = 'BUILDINGID', opacity=0.7, color_continuous_scale=px.colors.sequential.Viridis)

fig.update_traces(marker=dict(size=4))

fig.update_layout(
    width=950,
    margin=dict(r=100, l=100, b=10, t=10))

fig.show()
In [22]:
# Print, for each building and floor, the set of users who collected data there.
# NOTE(review): iterating with range(len(set(...))) assumes building and floor
# ids are contiguous 0..n-1 — true for this dataset per the output below.
for i_b in range(len(set(df['BUILDINGID']))):
    print("building: ", i_b)
    n_f_per_b = len(set(df[df['BUILDINGID']==i_b]['FLOOR']))
    for i_f in range(n_f_per_b):
        unique_user = set(df[(df['BUILDINGID']==i_b) & (df['FLOOR']==i_f)]['USERID'])
        print("floor: ", i_f, "userID: ", unique_user)
    print("\n")
building:  0
floor:  0 userID:  {1, 11}
floor:  1 userID:  {1, 11}
floor:  2 userID:  {1, 11}
floor:  3 userID:  {1, 11}


building:  1
floor:  0 userID:  {7, 11, 13, 14, 16}
floor:  1 userID:  {7, 11, 13, 14, 16}
floor:  2 userID:  {9, 2, 10, 4}
floor:  3 userID:  {8, 9, 10, 17, 18}


building:  2
floor:  0 userID:  {8, 9, 10, 17}
floor:  1 userID:  {7, 12, 15, 16, 18}
floor:  2 userID:  {11, 5, 14}
floor:  3 userID:  {2, 5, 6, 11, 14}
floor:  4 userID:  {3, 13, 6}


There appears to be a missing corner in our 3rd building; also, the user distribution across the first three floors seems a little odd.

In [23]:
# 3-D view of building 2 restricted to the users who covered floor 1 and to
# SPACEID < 200, coloured by relative position.
# (The original defined an unused local `numbers = [3, 8]`; removed as dead code.)
fig = px.scatter_3d(df[(df['BUILDINGID']==2) & (df['USERID'].isin([7, 12, 15, 16, 18])) & (df['SPACEID']<200)], x="LONGITUDE", y="LATITUDE",
                    z='FLOOR', color= 'RELATIVEPOSITION',opacity=0.7,color_continuous_scale=px.colors.sequential.Viridis)

fig.update_traces(marker=dict(size=4))

fig.update_layout(
    width=950,
    margin=dict(r=100, l=100, b=10, t=10))

fig.show()
In [24]:
# Compare building 2 coverage with and without user 12 to see which areas
# only that user recorded.
fig = px.scatter(df[(df['BUILDINGID']==2) & (df['USERID'].isin([7, 15, 12, 16, 18]))], x="LONGITUDE", y="LATITUDE",
                 color='USERID', color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(height=400, width=800, title_text="Including 12")

fig2 = px.scatter(df[(df['BUILDINGID']==2) & (df['USERID'].isin([7, 15, 16, 18]))], x="LONGITUDE", y="LATITUDE",
                  color='USERID', color_continuous_scale=px.colors.sequential.Viridis)
fig2.update_layout(height=400, width=800, title_text="Excluding 12")

fig.show()
fig2.show()

It seems that our User #12 contributed nothing to one side of the building; curiously, that's the very side missing from our 5th floor, so let's plot it including that user and see how it looks.

In [25]:
# Plot users 12 and 3 (SPACEID >= 200) in building 2 to check whether user
# 12's points fill the area missing from the top floor.
fig = px.scatter(df[(df['BUILDINGID']==2) & (df['USERID'].isin([12, 3])) & (df['SPACEID']>=200)], x="LONGITUDE", y="LATITUDE",
                 color='USERID', color_continuous_scale=px.colors.sequential.Viridis)

fig.show()
In [26]:
# Row counts per floor in building 2.
df[(df['BUILDINGID']==2)]['FLOOR'].value_counts()
Out[26]:
3    2708
1    2162
0    1942
2    1577
4     727
Name: FLOOR, dtype: int64

Interesting take: it seems that one of the sides our User #12 recorded doesn't belong to FloorId #1 but to FloorId #4. Given that we can't ask the company to validate this information, all we can do is make an assumption. Based on our analysis, we believe there was a problem during user #12's recording of one side of the building: those points were assigned to the 2nd floor when they should have been assigned to the 5th. However, since we cannot validate this information, we'll keep the data as it is and continue with our analysis rather than change it based solely on this assumption.

In [27]:
#df1 = df[(df['BUILDINGID']==2) & (df['USERID'].isin([12])) & (df['SPACEID']>=200)]
#df1['FLOOR'] = df1['FLOOR'].replace(1,4)
#df.loc[df1.index.values.tolist()] = df1
In [28]:
#fig = px.scatter_3d(df[(df['BUILDINGID']==2)], x="LONGITUDE", y="LATITUDE", z='FLOOR', color = 'FLOOR', opacity=0.7, color_continuous_scale=px.colors.sequential.Viridis)

#fig.update_traces(marker=dict(size=4))

##fig.update_layout(
    #width=950,
    #margin=dict(r=100, l=100, b=10, t=10))

#fig.show()

Data Selection & Predictive Models¶

Given that our validation data contains no information about SpaceId and RelativePosition, we will not be using these variables for further analysis and predictions. Also, since there is no theory to support that Timestamp and PhoneId can help predict a person's location, we'll drop those variables too.

In [29]:
# Features: the 520 WAP columns.  Target: column 523, which is BUILDINGID
# after the earlier column reordering (520=LATITUDE, 521=LONGITUDE, 522=FLOOR,
# 523=BUILDINGID).
X = np.asarray(df.iloc[:,0:520])
y = np.asarray(df.iloc[:,523:524])
X_val = np.asarray(validation.iloc[:,0:520])
y_val = np.asarray(validation.iloc[:,523:524])
In [30]:
# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, shuffle=True)
In [31]:
# Standardize the WAP features; fit the scaler on the training split only
# (avoids leaking test/validation statistics), then apply it everywhere.
scaler = StandardScaler()

scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)
In [32]:
# Sanity-check the shapes of the three splits.
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)
print ('Validation set:', X_val.shape,  y_val.shape)
Train set: (15430, 520) (15430, 1)
Test set: (3858, 520) (3858, 1)
Validation set: (1111, 520) (1111, 1)

Building¶

KNN¶

In [33]:
# Sweep k = 1..19 for KNN and record validation-set accuracy for each k.
# NOTE(review): choosing k on the validation set leaks information from it,
# so the reported validation accuracy for the winning k is optimistically
# biased; prefer selecting k on the test split or via cross-validation.
Ks = 20
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))

for n in range(1,Ks):

    # Train model and predict on the validation set
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    pred_val=neigh.predict(X_val)
    mean_acc[n-1] = metrics.accuracy_score(y_val, pred_val)

    # Standard error of the per-sample accuracy.  y_val is shaped (n, 1), so
    # ravel() keeps the comparison 1-D — the original `pred_val==y_val`
    # broadcast to an (n, n) matrix and computed the std over that.
    std_acc[n-1]=np.std(pred_val==y_val.ravel())/np.sqrt(pred_val.shape[0])

# (Dropped the original's bare `mean_acc` expression — a no-op mid-cell.)
print( "The best accuracy was with", round(mean_acc.max(),4), "with k=", mean_acc.argmax()+1) 
The best accuracy was with 0.9937 with k= 2
In [34]:
# Retrain KNN with the best k found above and report accuracy on all splits.
k = mean_acc.argmax()+1
#Train Model and Predict  
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train, y_train)

pred = neigh.predict(X_test)
pred_val = neigh.predict(X_val)

# NOTE(review): neigh.predict(X_train) is recomputed inside the print; cache
# it in a variable if this cell becomes slow.
print("Train set Accuracy: ", round(metrics.accuracy_score(y_train, neigh.predict(X_train)),4))
print("Test set Accuracy: ", round(metrics.accuracy_score(y_test, pred),4))
print("Validation set Accuracy: ", round(metrics.accuracy_score(y_val, pred_val),4))
Train set Accuracy:  0.9982
Test set Accuracy:  0.9974
Validation set Accuracy:  0.9937

SVM¶

In [35]:
# Sigmoid-kernel SVM baseline for predicting the building.
clf = svm.SVC(kernel='sigmoid')
clf.fit(X_train, y_train) 
y_train_pred = clf.predict(X_train)
y_pred = clf.predict(X_test)
y_val_pred = clf.predict(X_val)

# Reuse y_train_pred instead of predicting the training set a second time
# (the original called clf.predict(X_train) again inside the print).
print("Train set Accuracy: ", round(metrics.accuracy_score(y_train, y_train_pred),4))
print("Test set Accuracy: ", round(metrics.accuracy_score(y_test, y_pred),4))
print("Validation set Accuracy: ", round(metrics.accuracy_score(y_val, y_val_pred),4))
Train set Accuracy:  0.9957
Test set Accuracy:  0.9933
Validation set Accuracy:  0.9928

Decision Trees Models¶

In [36]:
# Rebuild unscaled feature/target arrays for the tree-based models
# (trees do not need standardization).  Column 523 is BUILDINGID.
X = np.asarray(df.iloc[:,0:520])
y = np.asarray(df.iloc[:,523:524])
X_val = np.asarray(validation.iloc[:,0:520])
y_val = np.asarray(validation.iloc[:,523:524])
In [37]:
# Candidate tree-based classifiers as (label, estimator) pairs.
algos_Class = [
    ('Random Forest Classifier', RandomForestClassifier()),
    ('Decision Tree Classifier', DecisionTreeClassifier()),
    ('Gradient Boosting Classifier', GradientBoostingClassifier()),
]
In [38]:
# 3-fold cross-validated accuracy for each candidate model.
results = []
names = []
for name, model in algos_Class:
    names.append(name)
    results.append(cross_val_score(model, X, y, cv=3, scoring='accuracy'))

for name, scores in zip(names, results):
    print(name, scores.mean())
Random Forest Classifier 0.9881268303672908
Decision Tree Classifier 0.9878677577246248
Gradient Boosting Classifier 0.9910307275523259
In [39]:
# Refit the three tree-based models on an 80/20 split and score each one on
# the independent validation set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, shuffle=True)

dtc = DecisionTreeClassifier()
gbm = GradientBoostingClassifier()
rfc = RandomForestClassifier()

for mdl in (dtc, gbm, rfc):
    mdl.fit(X_train, y_train)

dtc_pred, gbm_pred, rfc_pred = (m.predict(X_val) for m in (dtc, gbm, rfc))

print("DTC Validation set Accuracy: ", metrics.accuracy_score(y_val, dtc_pred))
print("GBM Validation set Accuracy: ", metrics.accuracy_score(y_val, gbm_pred))
print("RFC Validation set Accuracy: ", metrics.accuracy_score(y_val, rfc_pred))
DTC Validation set Accuracy:  0.9720972097209721
GBM Validation set Accuracy:  0.9981998199819982
RFC Validation set Accuracy:  0.9990999099909991

It seems that 2 of the 3 decision-tree models are giving us nearly perfect results when predicting the building; we'll use those predictions and now turn our attention to predicting the floor.

Floor¶

In [40]:
# Cascade step 1: the validation set gets the PREDICTED building (rfc_pred)
# as feature column 520, while the training frame exposes the true BUILDINGID
# in the same position — so the floor model trains on truth but is evaluated
# with the upstream prediction, as it would be in production.
validation.insert(520, 'BUILDING_PRED', rfc_pred)
df.insert(520, 'BUILDINGID', df.pop('BUILDINGID'))
In [41]:
# Features: 520 WAPs + building (true for df, predicted for validation) = 521
# columns.  Target: column 523, which is FLOOR after the insert above.
X = np.asarray(df.iloc[:,0:521])
y = np.asarray(df.iloc[:,523:524])
X_val = np.asarray(validation.iloc[:,0:521])
y_val = np.asarray(validation.iloc[:,523:524])
In [42]:
# Same 80/20 split and seed as the building stage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, shuffle=True)
In [43]:
# Re-fit the existing scaler on the new 521-column feature matrix and apply
# it to all splits (fitting on the training split only avoids leakage).
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)
In [44]:
# Sanity-check shapes: features should now have 521 columns.
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)
print ('Validation set:', X_val.shape,  y_val.shape)
Train set: (15430, 521) (15430, 1)
Test set: (3858, 521) (3858, 1)
Validation set: (1111, 521) (1111, 1)

KNN¶

In [45]:
# Sweep k = 1..19 for KNN on the floor task (same procedure as the building
# stage — consider factoring this sweep into a shared function).
# NOTE(review): as before, selecting k on the validation set biases the
# reported validation accuracy upward.
Ks = 20
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))

for n in range(1,Ks):

    # Train model and predict on the validation set
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    pred_val=neigh.predict(X_val)
    mean_acc[n-1] = metrics.accuracy_score(y_val, pred_val)

    # Standard error of the per-sample accuracy.  y_val is (n, 1); ravel()
    # keeps the comparison 1-D (the original broadcast to an (n, n) matrix).
    std_acc[n-1]=np.std(pred_val==y_val.ravel())/np.sqrt(pred_val.shape[0])

# (Dropped the original's bare `mean_acc` expression — a no-op mid-cell.)
print( "The best accuracy was with", round(mean_acc.max(),4), "with k=", mean_acc.argmax()+1) 
The best accuracy was with 0.8272 with k= 16
In [46]:
# Retrain KNN with the best k found above and report accuracy on all splits.
k = mean_acc.argmax()+1
#Train Model and Predict  
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train, y_train)

pred = neigh.predict(X_test)
pred_val = neigh.predict(X_val)

# NOTE(review): neigh.predict(X_train) is recomputed inside the print; cache
# it in a variable if this cell becomes slow.
print("Train set Accuracy: ", round(metrics.accuracy_score(y_train, neigh.predict(X_train)),4))
print("Test set Accuracy: ", round(metrics.accuracy_score(y_test, pred),4))
print("Validation set Accuracy: ", round(metrics.accuracy_score(y_val, pred_val),4))
Train set Accuracy:  0.9798
Test set Accuracy:  0.9705
Validation set Accuracy:  0.8272

SVM¶

In [47]:
# Refit the SVM on the floor task.
# NOTE(review): this reuses the sigmoid-kernel `clf` instance created in the
# building stage; constructing a fresh estimator here would make the cell
# self-contained under out-of-order execution.
clf.fit(X_train, y_train) 
y_train_pred = clf.predict(X_train)
y_pred = clf.predict(X_test)
y_val_pred = clf.predict(X_val)

# Reuse y_train_pred instead of predicting the training set a second time
# (the original called clf.predict(X_train) again inside the print).
print("Train set Accuracy: ", round(metrics.accuracy_score(y_train, y_train_pred),4))
print("Test set Accuracy: ", round(metrics.accuracy_score(y_test, y_pred),4))
print("Validation set Accuracy: ", round(metrics.accuracy_score(y_val, y_val_pred),4))
Train set Accuracy:  0.9062
Test set Accuracy:  0.9046
Validation set Accuracy:  0.8434

Decision Trees Models¶

In [48]:
# Rebuild unscaled arrays for the tree-based floor models (521 features;
# target column 523 = FLOOR).
X = np.asarray(df.iloc[:,0:521])
y = np.asarray(df.iloc[:,523:524])
X_val = np.asarray(validation.iloc[:,0:521])
y_val = np.asarray(validation.iloc[:,523:524])
In [49]:
# Fresh 80/20 split on the unscaled features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, shuffle=True)
In [50]:
# Fit the three tree-based models on the training split and score each on the
# validation set (accuracies rounded to 4 decimals).
dtc = DecisionTreeClassifier()
gbm = GradientBoostingClassifier()
rfc = RandomForestClassifier()

for mdl in (dtc, gbm, rfc):
    mdl.fit(X_train, y_train)

dtc_pred, gbm_pred, rfc_pred = (m.predict(X_val) for m in (dtc, gbm, rfc))

print("DTC Validation set Accuracy: ", round(metrics.accuracy_score(y_val, dtc_pred),4))
print("GBM Validation set Accuracy: ", round(metrics.accuracy_score(y_val, gbm_pred),4))
print("RFC Validation set Accuracy: ", round(metrics.accuracy_score(y_val, rfc_pred),4))
DTC Validation set Accuracy:  0.7975
GBM Validation set Accuracy:  0.8767
RFC Validation set Accuracy:  0.9091

Random Forest Classifier seems to be our best model for predicting the floor a person is on; next, we'll try to predict the longitude and latitude.

Longitude¶

RFR¶

In [51]:
# Cascade step 2: the validation set gets the PREDICTED floor (rfc_pred from
# the cell above) as feature column 521, while the training frame exposes the
# true FLOOR in the same position.
validation.insert(521, 'FLOOR_PRED', rfc_pred)
df.insert(521, 'FLOOR', df.pop('FLOOR'))
In [52]:
# Features: 520 WAPs + building + floor (522 columns).  Target: LONGITUDE
# (regression).
X = np.asarray(df.iloc[:,0:522])
y = np.asarray(df['LONGITUDE'])
X_val = np.asarray(validation.iloc[:,0:522])
y_val = np.asarray(validation['LONGITUDE'])
In [53]:
# 80/20 split for the longitude regression.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, shuffle=True)
In [54]:
# Random-forest regressor for LONGITUDE: 1000 trees, fixed seed for
# reproducibility.
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(X_train, y_train)

predictions = rf.predict(X_test)
# Mean absolute error on the held-out test split (coordinate units ~ meters).
errors = np.abs(predictions - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'meters.')
Mean Absolute Error: 2.13 meters.
In [55]:
# Evaluate the longitude model on the independent validation set.
predictions = rf.predict(X_val)
# Calculate the absolute errors
errors = abs(predictions - y_val)
# Print out the mean absolute error (mae).  The gap versus the test-split MAE
# presumably reflects the 4-month delay between training and validation
# collection — worth confirming.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'meters.')
Mean Absolute Error: 7.71 meters.

Latitude¶

RFR¶

In [56]:
# Cascade step 3: predicted longitude becomes validation feature column 522;
# the training frame exposes the true LONGITUDE in the same position.
validation.insert(522, 'LONGITUDE_PRED', predictions)
df.insert(522, 'LONGITUDE', df.pop('LONGITUDE'))
In [57]:
# Features: 520 WAPs + building + floor + longitude (523 columns).
# Target: LATITUDE (regression).
X = np.asarray(df.iloc[:,0:523])
y = np.asarray(df['LATITUDE'])
X_val = np.asarray(validation.iloc[:,0:523])
y_val = np.asarray(validation['LATITUDE'])
In [58]:
# 80/20 split for the latitude regression.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 123, shuffle=True)
In [59]:
# Random-forest regressor for LATITUDE: 1000 trees, fixed seed for
# reproducibility.
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(X_train, y_train)

predictions = rf.predict(X_test)
# Mean absolute error on the held-out test split (coordinate units ~ meters).
errors = np.abs(predictions - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'meters.')
Mean Absolute Error: 1.15 meters.
In [60]:
# Evaluate the latitude model on the independent validation set.
predictions = rf.predict(X_val)
# Calculate the absolute errors
errors = abs(predictions - y_val)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'meters.')
Mean Absolute Error: 7.73 meters.

Evaluation¶

In [61]:
# Final cascade step: predicted latitude becomes validation feature column
# 523; the training frame exposes the true LATITUDE in the same position.
validation.insert(523, 'LATITUDE_PRED', predictions)
df.insert(523, 'LATITUDE', df.pop('LATITUDE'))
In [62]:
# Overall positioning error: mean Euclidean distance between predicted and
# actual (LONGITUDE, LATITUDE) positions on the validation set.
# NOTE(review): the variable is named MAE but this is really the mean 2-D
# localisation error; the original's outer abs() was redundant (sqrt is
# already non-negative) and has been removed, along with doubled parentheses.
MAE = np.mean(np.sqrt((validation['LONGITUDE'] - validation['LONGITUDE_PRED']) ** 2
                      + (validation['LATITUDE'] - validation['LATITUDE_PRED']) ** 2))

print('Accurate between a range of', round(MAE,2), 'meters.')
Accurate between a range of 12.14 meters.

It looks like our cascade method is giving us decent results: our prediction has an accuracy of around 100% when determining the building the person is in, 91% when determining the floor the person is on, and it is also accurate within a range of around 12 meters when determining the person's position.

Conclusions¶

  • Based on the data available, we can predict the building by using only the WAP fingerprint, and then run our cascade process to determine the floor, longitude and latitude of a person's position.

  • Random Forest (Classifier and Regressor) seems to be our go to models to try and determine the position of a person with the best possible precision.

  • Our model ends up being accurate within a range of around 12 meters; we believe this is a good result and that it should be implemented in the new app.